Required Packages

library(tidyverse)
library(RColorBrewer)
library(readxl)
library(minfi)
library(ggplot2)
library(ENmix)
library(stringr)
library(geneplotter)
library(ChAMP)
library(wateRmelon)
library(sva)
library(pdftools)
library(filesstrings)
library(Harman)
library(DMRcate)

knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)

Prepare and Load Data

Metadata Preparation

First, we load the buccal cell sample metadata. Note that the orignal sample metadata had a number of errors: there was a duplication of sample IDs.

# Get the sample info. Note that the top-level file for this R project is a
# clone of the "Superfund-Methylatio" GitHub repository. The data is stored in a
# copy of the "Arsenic Epigeenetics META" repository, locally named
# "arsenic-epigenetics-meta".
sample_metadata <- read_xlsx(
  path = paste0(
    "../../arsenic-epigenetics-meta/Chile/20170212_AC2013_DNAmethArray_Buccal",
    "_PBMCs_MetaData.xlsx"),
  sheet = "AC2013_DNAmeth_PL2_Buccal"
) %>%
  mutate(
    position = `Position (on FINAL PLATE)`,
    sample_id = `Sample ID`,
    buccal_sample_id = `BUCCAL Sample ID`,
    study_id = `Study_ID...5`,
    exposed = as.factor(if_else(`Exposed: Yes:1; No:2` == 1, "yes", "no")),
    age = P3Q8_Age,
    sex = as.factor(if_else(`P3Q9_Sex_(M=1,F=2)` == 1, "M", "F")),
    smoking = as.factor(if_else(P38Q96_Smoking == 1, "yes", "no")),
    secondhand_smoking = as.factor(
      if_else(P40Q116_Smoking_Secondhand_for_non_smokers == "1", "yes", "no")
    ),
    diabetes = as.factor(if_else(P11Q35_Diabetes == 1, "yes", "no")),
    skin_cancer = as.factor(if_else(P10Q33_Cancer_Skin == 1, "yes", "no")),
    hypertension = as.factor(if_else(P12Q37_Hypertension == 1, "yes", "no")),
    cardio = as.factor(if_else(P13Q38_Cardio == 1, "yes", "no")),
    breastfed = as.factor(if_else(P26Q63_Breastfed == "1", "yes", "no")),
    gastric = as.factor(if_else(P16Q43_Gastric == 1, "yes", "no")),
    tuberculosis = as.factor(if_else(P15Q42_Tuberculosis == 1, "yes", "no")),
    intestinal = as.factor(if_else(P17Q45_Intestinal == 1, "yes", "no")),
    childhood_diseases = as.factor(
      if_else(P20Q50_Childhood_diseases == 1, "yes", "no")
    ),
    anemia = as.factor(if_else(P18Q47_Anemia == 1, "yes", "no")),
    immune = as.factor(if_else(P19Q48_Immune == 1, "yes", "no")),
    antibiotics = as.factor(
      if_else(`P22Q54_Antibiotics_(Yes=1,No=2)` == 1, "yes", "no")
    ),
    immunosuppresor = as.factor(
      if_else(`P24Q58_Immuosupresores_(Yes=1,No=2)` == 1, "yes", "no")
    ),
    laxatives = as.factor(
      if_else(`P25Q62_laxatives_(Yes=1,No=2)` == 1, "yes", "no")
    ),
    med_high_triglycerides = as.factor(
      if_else(Med_High_triglycerides == 1, "yes", "no")
    ),
    med_insulin_resistance = as.factor(
      if_else(Med_Insulin_resistance == 1, "yes", "no")
    ),
    med_hypertension = as.factor(if_else(Med_Hypertension == 1, "yes", "no")),
    med_diabetes = as.factor(if_else(Med_Diabetes == 1, "yes", "no")),
    med_high_cholesterol = as.factor(
      if_else(Med_High_Cholesterol == 1, "yes", "no")
    ),
    med_anticoagulant = as.factor(if_else(Med_anticoagulant == 1, "yes", "no")),
    weight_20 = P4Q12a_Weight20,
    weight_40 = P4Q12b_Weight40
  ) %>%
  dplyr::select(
    position, sample_id, buccal_sample_id, study_id, exposed, age, sex, smoking,
    secondhand_smoking, diabetes, skin_cancer, hypertension, cardio, breastfed,
    gastric, tuberculosis, intestinal, childhood_diseases, anemia, immune,
    antibiotics, immunosuppresor, laxatives, med_high_triglycerides,
    med_insulin_resistance, med_hypertension, med_diabetes,
    med_high_cholesterol, med_anticoagulant, weight_20, weight_40
  ) %>%
  filter(
    sample_id != "QC", buccal_sample_id != "BLANK"
  )

# assign duplicates
sample_metadata <- sample_metadata %>%
  mutate(
    duplicates = if_else(sample_id %in% c("B1", "B36"), 1,
                   if_else(sample_id %in% c("B16", "B34"), 2,
                     if_else(sample_id %in% c("B2", "B5"), 3,
                       if_else(sample_id %in% c("B7", "B41"), 4, 0))))
  )

# now load the sentrix id and position data
sample_info <- read_csv(
  "../../arsenic-epigenetics-meta/Chile/Sample_Info-Berkeley.csv"
) %>%
  dplyr::select(
    Sample.ID, Sample_Well, Sentrix_ID, Sentrix_Position
  ) %>%
  mutate(
    sentrix_full = paste(Sentrix_ID, Sentrix_Position, sep = "_"),
    sample_id = Sample.ID,
    position = Sample_Well
  ) %>%
  dplyr::select(
    sample_id, position, sentrix_full
  ) %>%
  filter(
    sample_id %in% paste0("B", 1:47)
  )
 
# create a vector of idat filenames
sample_info$Basename <- paste0(
  paste0("/Users/philippe/Documents/berkeley/",
         "superfund/arsenic-epigenetics-meta/Chile/idat/"),
  sample_info$sentrix_full
)

# include the base idat filenames to the sample metadata
sample_info <- sample_info %>%
  dplyr::select(position, Basename)
sample_metadata <- sample_metadata %>%
  left_join(sample_info, by = "position")

With the meta data completed, we now remove the duplicates from the data, retaining only the first sample in order of sample ID from each individual. The duplicated samples are listed below:

  • Participant AC130004: B1 and B36
  • Participant AC130037: B16 and B34
  • Participant AC130044: B2 and B5
  • Participant AC130079: B7 and B41
keep <- !(sample_metadata$sample_id %in% c("B36", "B34", "B5", "B41"))
sample_metadata <- sample_metadata[keep, ]
rm(sample_info, keep)

Create RGChannelSetExtended Object

Next, we create or load the RGChannelSetExtended object, depending on whether or not it’s been generated locally in the past.

# if the RGset already exists, run this code chunk instead of the one above
load("../../arsenic-epigenetics-meta/Chile/buccal/data/buccal_RGset.RData")
pheno <- pData(RGset)
sample_name <- rownames(pheno)
pheno <- data.frame(pheno, sample_name)

# adding Sentrix ID for subsequent batch correction in downstream analysis
sentrix_id <- as.factor(
  stringr::str_extract_all(pheno$sample_name, "^([0-9]{12})",
                           simplify = TRUE)
)
pheno <- data.frame(pheno, sentrix_id)

# remove features without any variation
pheno <- pheno[, apply(pheno, 2, function(x) length(unique(x)) > 1)]

Internal Quality Control

First, we check that the signal to noise ratio of each sample. Sample B14’s detection p-values are significantly larger than the rest, and so we remove it from consideration.

# check detection p-values
det_p <- detectionP(RGset)
pal <- brewer.pal(8, "Dark2")
barplot(
  colMeans(det_p), col = pal[factor(pheno$sample_id)], las = 2,
  cex.names = 0.8, ylab = "Mean detection p-values",
  names.arg = pheno$sample_id
)

# remove B14
RGset <- RGset[, -26]
pheno <- pData(RGset)

det_p <- detectionP(RGset)
pal <- brewer.pal(8, "Dark2")
barplot(
  colMeans(det_p), col = pal[factor(pheno$sample_id)], las = 2,
  cex.names = 0.8, ylab = "Mean detection p-values",
  names.arg = pheno$sample_id
)

We then produce a PDF version QC report for Illumina Infinium Human Methylation 450k arrays, which is useful for identifying failed samples.

Outlier information

Pre-Filtering

A number of functions are run sequentially on the RGset object. First, outliers are identified based on their median unmethylated/methylated intensity.

Next, champ.QC is used to perform a brief exploratory data analysis. The resulting plots are listed below.

# Create pdf of quality control plots: mdsplot, densityPlot, dendrogram, et.c
champ.QC(
  beta = getBeta(Mset), pheno = pheno$exposed, mdsPlot = TRUE,
  densityPlot = TRUE, dendrogram = TRUE, PDFplot = TRUE, Rplot = FALSE,
  Feature.sel = "None", resultsDir = "preprocessing-reports/QC_preFiltering"
)

Pre-Filtering Beta Values

Pre-Filtering MDS Plot

Pre-Filtering Dendogram

Pre-Filtering SVD

# remove uninteresting variables from SVD
rm_svd <- c("position", "sample_id", "buccal_sample_id", "study_id",
            "duplicates", "Basename", "filenames", "sample_name", "sentrix_id",
            "xMed", "yMed", "predictedSex")

# run SVD
champ.SVD(
  beta = getBeta(Mset)[complete.cases(getBeta(Mset)), ],
  pd = pheno[, -which(colnames(pheno) %in% rm_svd)],
  RGEffect = TRUE, Rplot = FALSE,
  resultsDir = "preprocessing-reports/QC_preFiltering/"
)

Post Filtering

Finally, quantitiative measures of data quality are computed, including number of beads, averaged bisulfite conversion intensity, and detection p-values of probes. Low quality samples and probes are identified, as well as the outliers based on total intensity and beta value distributions. The results are listed below.

# compute QC metrics
qc <- ENmix::QCinfo(RGset, distplot = FALSE)
## 0  samples with percentage of low quality CpG value greater than  0.05  or bisulfite intensity less than  8212.242 
## 32334  CpGs with percentage of low quality value greater than  0.05 
## Ploting qc_sample.jpg ...
## Done
## Ploting qc_CpG.jpg ...
## Done
## Identifying ourlier samples based on beta or total intensity values...
## After excluding low quality samples and CpGs
## 0  samples are outliers based on averaged total intensity value 
## 1  samples are outliers in beta value distribution 
## 1  outlier samples were added into badsample list
# beta: percentage of metylation
# M: log2 ratio of intensities of methylated vs. unmethylated
betas <- rmSNPandCH(
  getBeta(Mset), dist = 2, mafcut = 0.05, and = TRUE, rmcrosshyb = TRUE,
  rmXY = FALSE
)
mvals <- beta2m(betas)

# removing filtered probes from qc
qc$nbead <- qc$nbead[rownames(qc$nbead) %in% rownames(betas), ]
qc$detP <- qc$detP[rownames(qc$detP) %in% rownames(betas), ]

filtered <- champ.filter(
  beta = betas, M = mvals,
  pd = pheno, beadcount = qc$nbead, detP = qc$detP,
  arraytype = "EPIC", filterXY = TRUE, SampleCutoff = 0.05
)
##                     Failed CpG Fraction.
## 200607110070_R01C01         0.0009318231
## 200607110071_R01C01         0.0024406071
## 200627670015_R01C01         0.0031379235
## 200650230008_R01C01         0.0124164492
## 200650230021_R01C01         0.0015063025
## 200607110070_R02C01         0.0013201861
## 200607110071_R02C01         0.0015348404
## 200627670015_R02C01         0.0026019081
## 200650230008_R02C01         0.0015236734
## 200650230021_R02C01         0.0009144522
## 200607110070_R03C01         0.0009640832
## 200607110071_R03C01         0.0018227005
## 200627670015_R03C01         0.0010894017
## 200650230008_R03C01         0.0008499318
## 200650230049_R03C01         0.0007395027
## 200607110070_R04C01         0.0013971142
## 200607110071_R04C01         0.0031850730
## 200627670015_R04C01         0.0020237063
## 200650230021_R04C01         0.0007084833
## 200650230049_R04C01         0.0007531513
## 200607110071_R05C01         0.0010000658
## 200627670015_R05C01         0.0006576115
## 200650230008_R05C01         0.0005533863
## 200650230049_R05C01         0.0005906096
## 200607110070_R06C01         0.0011005686
## 200627670015_R06C01         0.0022755839
## 200650230008_R06C01         0.0007221319
## 200650230021_R06C01         0.0009541570
## 200650230049_R06C01         0.0004938290
## 200607110070_R07C01         0.0011514405
## 200607110071_R07C01         0.0018053296
## 200650230008_R07C01         0.0008722658
## 200650230021_R07C01         0.0004342717
## 200650230049_R07C01         0.0006737416
## 200607110070_R08C01         0.0029505662
## 200607110071_R08C01         0.0078417067
## 200627670015_R08C01         0.0026105935
## 200650230008_R08C01         0.0016787704
## 200650230021_R08C01         0.0010993279

We next perform the same exploratory data analysis as before, but with the low-quality probes removed.

# filter sample and probes
pheno <- filtered$pd
betas <- filtered$beta
mvals <- filtered$M

filtered_CpG <- rownames(betas)
unfiltered_CpG <- rownames(getBeta(Mset))
outCpG <- unfiltered_CpG[!(unfiltered_CpG %in% filtered_CpG)]
  
# recompute the density plots
champ.QC(
  beta = betas, pheno = pheno$exposed, mdsPlot = TRUE,
  densityPlot = TRUE, dendrogram = TRUE, PDFplot = TRUE, Rplot = FALSE,
  Feature.sel = "None",
  resultsDir = "preprocessing-reports/QC_postFiltering"
)

# save the csv
write.csv(
  pheno,
  file = "../../arsenic-epigenetics-meta/Chile/buccal/data/buccal_pheno.csv"
)

Post-Filtering Beta Values

Post-Filtering MDS Plot

Post-Filtering Dendogram

Post-Filtering SVD

# run SVD
champ.SVD(
  beta = betas,
  pd = pheno[, -which(colnames(pheno) %in% rm_svd)],
  RGEffect = TRUE, Rplot = FALSE,
  resultsDir = "preprocessing-reports/QC_postFiltering/"
)

Normalization

We now normalize the data, and re-perform the exploratory data analysis to validate that the quality control measures were successful.

Funnorm Normalization

funnorm <- preprocessFunnorm(RGset)
betas_fun <- minfi::getBeta(funnorm)
betas_fun <- betas_fun[rownames(betas_fun) %in% rownames(filtered$beta), ]
mvals_fun <- B2M(betas_fun)
pheno <- pData(funnorm)

# save the processed beta and mvals, as well as pheno data
save(
  betas_fun, mvals_fun, pheno,
  file = "../../arsenic-epigenetics-meta/Chile/buccal/data/buccal_funnorm_data.RData"
)
# perform EDA
champ.QC(
  beta = betas_fun, pheno = pheno$exposed, mdsPlot = TRUE, densityPlot = TRUE,
  dendrogram = TRUE, PDFplot = TRUE, Rplot = FALSE, Feature.sel = "None",
  resultsDir = "preprocessing-reports/QC_postNormalization/"
)

champ.SVD(
  beta = betas_fun, pd = pheno[, -which(colnames(pheno) %in% rm_svd)],
  RGEffect = TRUE, Rplot = FALSE,
  resultsDir = "preprocessing-reports/QC_postNormalization/"
)

Funnorm Normalization Betas Distribution

Funnorm Normalization MDS Plot

Funnorm Normalization Dendogram

SVD Results

Estimate Cell Counts

We now estimate the cell-type decomposition of each sample, and plot their distributions. The cell types are estimated via ReFACTor, a method based on sparse principal componenent analysis and designed for cell type heterogeneity in EWAS. It provides the estimates of cell type composisition. The bash script for generating these counts is provided below. For installation instructions, see the ReFACTor GitHub repo.

# create betas tsv
betaRefact <- cbind(rownames(betas_fun), data.frame(betas_fun))
colnames(betaRefact) <- str_sub(colnames(betaRefact), start = 2L)
colnames(betaRefact)[1] <- "ID"
write.table(betaRefact, file = 'preprocessing-reports/betas_ReFACTor.txt',
            sep = "\t", col.names = TRUE, row.names = FALSE)

# create pheno tsv
pheno$ID <- rownames(pheno)
subset_pheno <- pheno[, c("ID", "sex", "smoking")]
subset_pheno <- subset_pheno %>%
  as_tibble %>%
  mutate(
    sex = if_else(sex == "M", 0, 1),
    smoking = if_else(smoking == "yes", 1, 0)
  )
write.table(subset_pheno, file = "preprocessing-reports/pheno_ReFACTor.txt",
            sep = "\t", col.names = FALSE, row.names = FALSE)
# make sure that you're in the right directory
cd  superfund/Superfund-Methylation/Arsenic-buccal/preprocessing-reports

# estimate the number of sPCs to use (note that we can't take into account the
# effect of controlling for smoking and sex).
python estimate_k.py --datafile betas_ReFACTor.txt

# run ReFACTor (might need to run w/ pythonw if using conda)
python refactor.py --datafile betas_ReFACTor.txt --k 6 \
  --covarfile pheno_ReFACTor.txt \
  --out preprocessing-reports/refactor_buccal